import os
import pandas as pd
import numpy as np
import pickle
import sys
from pandas import HDFStore,DataFrame
from datetime import date, datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import yaml
import math
import time
import matplotlib.pyplot as plt
import seaborn as sns
#from matplotlib_venn import venn2
# Global configuration and constants for the churn-modelling notebook.
cfg = None
# Load config.yaml from the repository root directory.
with open("../config.yaml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
# Use 3 decimal places in output display
pd.set_option("display.precision", 3)
# Don't wrap repr(DataFrame) across additional lines
pd.set_option("display.expand_frame_repr", False)
# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
product_level = 'Product Level 2'
colors = ['#1F77B4', '#FF7F0E', 'orangered']
path_to_hdf_datastore = '../'+cfg['path_to_hdf_datastore']
# Strings that should be treated as missing values when cleaning raw data.
NA_VALUES_LIST = ['Unassigned', 'Unknown', 'nan', 'N.A', 'N.A.', 'NaN', 'Nan', '00-00-00', '0-00-00']
# Customer-id column names as they appear in each source dataset.
ECH_ecrid_col = 'ecrid'
journals_ecrid_col = 'SIS Id (Agreement SIS)'
other_ecrid_col = 'SIS Id (Agreement SIS)'
churn_activities_ecrid_col = 'ECR Id'
churn_risks_ecrid_col = 'Account Name: ECR Id'
account_assignment_ecrid_col = 'ECRID'
NPS_ecrid_col = 'ECR_ID'
usage_ecrid_col = 'ECR_ID'
interactions_ecrid_col = 'ECR_ID'
cancellations_ecrid_col = 'SIS Id (Agreement SIS)'
# Key used to join every individual dataset onto the base table.
MERGE_ID = 'SIS Id (Agreement SIS)'
def get_data_frame_summary(data_frame):
unique_values = data_frame.apply(lambda x: [x.unique()])
unique_counts = data_frame.apply(lambda x: len(x.unique()))
na_counts = data_frame.apply(lambda x: sum(x.isna()))
percent_missing = data_frame.apply(lambda x: sum(pd.isnull(x))/len(x)*100)
data_type = data_frame.dtypes
return pd.DataFrame(dict(unique_values = unique_values,
unique_counts = unique_counts,
na_counts = na_counts,
data_type = data_type,
percent_missing = percent_missing,
)).reset_index().sort_values(by='percent_missing', ascending=False)
# Function to scale features of numeric columns
def scale_numeric_features(data_frame, exclude=None,
                           method='standardize',
                           inplace=False):
    """Scale the numeric columns of *data_frame*.

    data_frame : frame to scale (modified and returned).
    exclude    : column names to leave untouched (default: none).
                 Fix: was a mutable default ``[]``; now ``None``-guarded.
    method     : 'standardize' ((x-mean)/std) or 'normalize'
                 ((x-min)/(max-min)); anything else just prints a warning.
    inplace    : if True overwrite the column, otherwise write to
                 '<col>_<method>d' (e.g. 'price_standardized').

    Returns the (mutated) *data_frame*.
    """
    exclude = [] if exclude is None else exclude
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns
    num_cols = num_cols.difference(exclude)
    print(f'********************* - Scaling following {len(num_cols)} features - **********************')
    for colname in num_cols:
        new_colname = colname if inplace else colname + '_' + method + 'd'
        print(f' {colname} {method}d to {new_colname}')
        if method == 'standardize':
            # Sample standard deviation (pandas default, ddof=1).
            data_frame[new_colname] = (data_frame[colname] - data_frame[colname].mean()) / data_frame[colname].std()
        elif method == 'normalize':
            # Hoist min: it was computed twice per column before.
            col_min = data_frame[colname].min()
            data_frame[new_colname] = (data_frame[colname] - col_min) / (data_frame[colname].max() - col_min)
        else:
            print(f'Unknown method {method} specified, please select one of "standardize" or "normalize"')
    return data_frame
# number of agreements per ecr per product
def get_rfm_features_from_contracts(dataframe, groupcols):
    """Build RFM-style (recency/frequency/monetary) features per group.

    dataframe : raw contracts frame with bookings, agreement numbers and
                'Agreement Start Date' as '%Y-%m-%d' strings.
    groupcols : columns defining the aggregation key
                (e.g. customer, or customer x product).

    Returns one row per group with bookings totals/means, agreement
    counts, days since first/last agreement, relationship length in
    years (at least 1) and bookings per year.
    """
    booking_col = "Bookings - Final Net Price - Agent Discount Amount(Rep)"
    out = dataframe.groupby(groupcols).agg(
        bookings=(booking_col, 'sum'),
        mean_bookings=(booking_col, 'mean'),
        num_agrmts=('Agreement Number', 'nunique'),
        num_agrmts_with_parent=('Parent Agreement Number', 'nunique'),
        last_agreement=('Agreement Start Date', 'max'),
        first_agreement=('Agreement Start Date', 'min'),
    ).sort_values('bookings', ascending=False)
    # Recency: days elapsed from the first/last agreement to today.
    for date_col, days_col in (('last_agreement', 'days_since_last_agreement'),
                               ('first_agreement', 'days_since_first_agreement')):
        out[date_col] = pd.to_datetime(out[date_col], format='%Y-%m-%d')
        out[days_col] = out[date_col].apply(lambda d: (datetime.today() - d).days)
    # Relationship length in whole years, floored at 1 to avoid a
    # division by zero in bookings_per_year.
    span_years = (out['days_since_first_agreement'] -
                  out['days_since_last_agreement']) / 365
    out['length_of_relationship'] = span_years.apply(math.ceil).clip(lower=1)
    out['bookings_per_year'] = out['bookings'] / out['length_of_relationship']
    return out.drop(['last_agreement', 'first_agreement'], axis=1)
def plot_2d_space(X, y, label='Classes'):
    """Scatter-plot the first two columns of *X*, one style per class in *y*.

    Supports up to two classes (two colours / two markers are defined);
    shows the figure with plt.show().
    """
    palette = ['#1F77B4', '#FF7F0E']
    shapes = ['o', 's']
    for cls, colour, shape in zip(np.unique(y), palette, shapes):
        mask = y == cls
        plt.scatter(X[mask, 0], X[mask, 1], c=colour, label=cls, marker=shape)
    plt.title(label)
    plt.legend(loc='upper right')
    plt.show()
def encode_columns(df, columns_to_encode):
    """One-hot encode each column in *columns_to_encode*.

    The dummy columns are joined onto *df* (clashing names get a
    '_<column>' suffix) and the original column is dropped.
    Returns the new frame; the input frame is not mutated.
    """
    for column in columns_to_encode:
        dummies = pd.get_dummies(df[column])
        print(f'Encoding columns : {column} to {len(dummies.columns)} new encoded columns')
        df = df.join(dummies, rsuffix='_' + column).drop(column, axis=1)
    return df
# Function to drop outliers of numeric columns
def drop_outliers(data_frame, exclude=None, include=None):
    """Drop rows whose numeric values fall outside the 5th-95th percentile.

    data_frame : frame to filter (NOT mutated -- see fix below).
    exclude    : numeric columns to skip (used only when *include* empty).
    include    : if given, restrict outlier checks to these columns.

    Returns a new frame with the outlier rows removed.

    Fixes: the original injected a temporary 'DROP' column directly into
    the caller's frame (a lasting side effect) and used mutable default
    arguments; we now build a boolean mask instead.
    """
    exclude = [] if exclude is None else exclude
    include = [] if include is None else include
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    num_cols = data_frame.select_dtypes(include=numerics).columns
    if len(include) > 0:
        num_cols = np.intersect1d(num_cols, include)
    elif len(exclude) > 0:
        num_cols = num_cols.difference(exclude)
    # Accumulate the rows to drop in a mask rather than a frame column.
    drop_mask = pd.Series(False, index=data_frame.index)
    for colname in num_cols:
        upper_lim = data_frame[colname].quantile(.95)
        lower_lim = data_frame[colname].quantile(.05)
        print(f'Dropping outliers for {colname} upper limit = {upper_lim} and lower limit = {lower_lim}')
        drop_mask = drop_mask | (data_frame[colname] > upper_lim) | (data_frame[colname] < lower_lim)
    data_frame = data_frame.loc[~drop_mask]
    print(f'Rows Remaining {data_frame.shape[0]}')
    return data_frame
def get_trend_feature(df, new_colname, groupcols, timecols, prefix=''):
    """Label each group in *df* with a trend category over *timecols*.

    df          : wide frame with one numeric column per period in *timecols*.
    new_colname : name of the categorical column added to *df*.
    groupcols   : key columns identifying a group (e.g. customer, product).
    timecols    : ordered period columns (e.g. years) to compare.
    prefix      : string prepended to every category label.

    The new column holds one of:
      '<prefix>increasing_trend' -- positive period-on-period change everywhere,
      '<prefix>decreasing_trend' -- not a single positive change anywhere,
      '<prefix>updown_trend'     -- everything else (mixed movement).
    Note: *df* is also mutated in place when the default label is assigned.
    """
    # diff() leaves the first period as NaN, so comparisons start here.
    SECOND_PERIOD = 1
    tempdf = df[groupcols + timecols]
    tempdf = tempdf.groupby(groupcols).sum()
    # increasing trend customers - all period-on-period diffs are True (> 0)
    increasing_cust = (tempdf.diff(axis=1) > 0)[timecols[SECOND_PERIOD:]].apply(all, axis=1)
    increasing_cust_index = increasing_cust[increasing_cust].index
    print("INCREASING TREND")
    print(increasing_cust.value_counts())
    # decreasing trend customers - all diffs False, i.e. any() is False
    decreasing_cust = (tempdf.diff(axis=1) > 0)[timecols[SECOND_PERIOD:]].apply(any, axis=1)
    decreasing_cust_index = decreasing_cust[~decreasing_cust].index
    print("DECREASING TREND")
    print(decreasing_cust.value_counts())
    # Default label; overwritten below for the monotone groups.
    df.loc[:, new_colname] = prefix+'updown_trend'
    # Re-index on the group key so the monotone-group indexes line up.
    df = df.set_index(groupcols)
    df.loc[increasing_cust_index, new_colname] = prefix+'increasing_trend'
    df.loc[decreasing_cust_index, new_colname] = prefix+'decreasing_trend'
    df = df.reset_index()
    return df
# Label Encoder
from sklearn import preprocessing
def encode_labels(df, cols_to_encode):
    """Label-encode each column in *cols_to_encode* in place.

    Returns the (mutated) frame together with a dict mapping column name
    to its fitted LabelEncoder, so encodings can be inverted later.
    """
    le_dict = {}
    for colname in cols_to_encode:
        encoder = preprocessing.LabelEncoder()
        # fit_transform == fit followed by transform for LabelEncoder.
        df[colname] = encoder.fit_transform(df[colname])
        le_dict[colname] = encoder
    return df, le_dict
def print_counts(df):
    """Print the shape of *df*, its unique customer count and unique product count.

    Fix: the original body referenced the global ``all_contracts_rfm``
    instead of the ``df`` argument, so it reported counts for the wrong
    frame (or fell into the bare ``except``) whenever any other frame was
    passed in; the bare ``except`` is narrowed to ``KeyError`` too.
    """
    print(df.shape)
    try:
        print(len(df['SIS Id (Agreement SIS)'].unique()))
    except KeyError:
        print('No customer col')
    try:
        print(len(df["Product Line Level 2"].unique()))
    except KeyError:
        print('No Product col')
# Load the contracts base data (dev extract) and inspect basic cardinalities.
#all_contracts = pd.read_pickle('../data/pickle/all_contracts_dev.pickle')
all_contracts = pd.read_pickle('../data/hdf/all_contracts_dev.pickle')
# Unique Customer x Product Level 2 Combinations
print(f'Unique Customers : {len(all_contracts["SIS Id (Agreement SIS)"].unique())}')
print(f'Unique Products at Level 2 : {len(all_contracts["Product Line Level 2"].unique())}')
print(f'Unique Customer Product Combinations : {len(all_contracts[["SIS Id (Agreement SIS)", "Product Line Level 2"]].drop_duplicates())}')
print(f' Total Bookings value all year in the data {all_contracts["Bookings - Final Net Price - Agent Discount Amount(Rep)"].sum()}')
# Yearly bookings overview (transposed so years run across the columns).
all_contracts.groupby(
    ['Subscription Start Year']
).agg(bookings=("Bookings - Final Net Price - Agent Discount Amount(Rep)", sum)
).sort_values('bookings', ascending=False).reset_index().sort_values(
    'Subscription Start Year').set_index('Subscription Start Year').T
# Customer hierarchy: join active customers (ECH) onto the parent/child tree.
ech = pd.read_hdf(path_to_hdf_datastore, cfg['ech_hdf_file'])
hierarchy = pd.read_hdf(path_to_hdf_datastore, cfg['hierarchy_file'])
active_cust_hierarchy = pd.merge(ech[['ecrid', 'Classification']],
                                 hierarchy, left_on='ecrid', right_on='CHILD_ECR', how='inner')
active_cust_hierarchy = active_cust_hierarchy.drop(['ecrid', ], axis=1)
# Per-parent rollup: number of children and deepest hierarchy level.
active_cust_hierarchy_agg = active_cust_hierarchy.groupby('PARENT_ECR').agg(
    num_child=('CHILD_ECR', pd.Series.nunique),
    max_hier=('HIER_LEVEL', max)
).sort_values('num_child', ascending=False)
active_cust_hierarchy_agg.head()
active_cust_hierarchy = pd.merge(active_cust_hierarchy, active_cust_hierarchy_agg,
                                 on='PARENT_ECR')
active_cust_hierarchy.head()
# RFM features aggregated per customer (across all products), columns
# renamed with a 'total_' prefix to distinguish them from the per-product set.
all_contracts_rfm_cust = get_rfm_features_from_contracts(all_contracts,
                                                         groupcols=['SIS Id (Agreement SIS)', 'TYPE'])
all_contracts_rfm_cust = all_contracts_rfm_cust.rename(columns={"bookings": "total_bookings",
                                                                "mean_bookings": "total_mean_bookings",
                                                                "num_agrmts": "total_num_agrmts",
                                                                "num_agrmts_with_parent": "total_num_agrmts_with_parents",
                                                                "days_since_last_agreement": "total_days_since_last_agreement",
                                                                "days_since_first_agreement": "total_days_since_first_agreement",
                                                                "length_of_relationship": "total_length_of_relationship",
                                                                "bookings_per_year": "total_bookings_per_year"
                                                                }).reset_index()
print(f'Unique Customers : {len(all_contracts_rfm_cust["SIS Id (Agreement SIS)"].unique())}')
all_contracts_rfm_cust.head()
all_contracts_rfm_cust['total_length_of_relationship'].unique()
# Flag big-spend customers (over one million bookings per year).
all_contracts_rfm_cust['over_million_year'] = all_contracts_rfm_cust['total_bookings_per_year'] > 1000000
all_contracts_rfm_cust['over_million_year'].value_counts()
# Same RFM features, now per customer x product combination ('prod_' prefix).
all_contracts_rfm_cust_prod = get_rfm_features_from_contracts(all_contracts,
                                                              groupcols=['SIS Id (Agreement SIS)', 'Product Line Level 2', 'TYPE'])
all_contracts_rfm_cust_prod = all_contracts_rfm_cust_prod.rename(columns={"bookings": "prod_bookings",
                                                                          "mean_bookings": "prod_mean_bookings",
                                                                          "num_agrmts": "prod_num_agrmts",
                                                                          "num_agrmts_with_parent": "prod_num_agrmts_with_parents",
                                                                          "days_since_last_agreement": "prod_days_since_last_agreement",
                                                                          "days_since_first_agreement": "prod_days_since_first_agreement",
                                                                          "length_of_relationship": "prod_length_of_relationship",
                                                                          "bookings_per_year": "prod_bookings_per_year"
                                                                          }).reset_index()
all_contracts_rfm_cust_prod.head()
# Combine customer-level and customer-x-product-level RFM features.
all_contracts_rfm = pd.merge(all_contracts_rfm_cust, all_contracts_rfm_cust_prod,
                             on=['SIS Id (Agreement SIS)', 'TYPE'], how='inner')
print(all_contracts_rfm_cust.shape)
print(all_contracts_rfm_cust_prod.shape)
print(all_contracts_rfm.shape)
# Free the intermediate frames.
del all_contracts_rfm_cust, all_contracts_rfm_cust_prod
all_contracts_rfm.head()
print_counts(all_contracts_rfm)
# Attach hierarchy info: a customer can match either as parent or as child,
# so take the union of both merges.
contracts_cust = pd.concat([pd.merge(all_contracts_rfm, active_cust_hierarchy,
                                     left_on='SIS Id (Agreement SIS)', right_on='PARENT_ECR'),
                            pd.merge(all_contracts_rfm, active_cust_hierarchy,
                                     left_on='SIS Id (Agreement SIS)', right_on='CHILD_ECR')])
print_counts(contracts_cust)
contracts_cust.head()
contracts_cust.columns
get_data_frame_summary(contracts_cust[['SIS Id (Agreement SIS)', 'CHILD_ECR', 'PARENT_ECR']])
print(len(pd.unique(contracts_cust[['SIS Id (Agreement SIS)', 'Product Line Level 2']].values.ravel('K'))))
print(len(pd.unique(contracts_cust[['PARENT_ECR', 'Product Line Level 2']].values.ravel('K'))))
print(len(pd.unique(contracts_cust[['CHILD_ECR', 'Product Line Level 2']].values.ravel('K'))))
# We keep one row per Customer X Product combination
contracts_cust = contracts_cust.drop_duplicates(['SIS Id (Agreement SIS)', 'Product Line Level 2'])
get_data_frame_summary(contracts_cust[['SIS Id (Agreement SIS)', 'CHILD_ECR', 'PARENT_ECR']])
contracts_cust.shape
The CHURN label was originally based on a cancelled contract. However, from the workshop we discovered that the CHURN label can instead be calculated from a downswing in the booking value of a customer for a given product.
Based on this logic we have the following labels for every year:
UPSOLD
LOST
TOTAL_CHURN
PARTIAL_CHURN
RETAINED
We will attach these labels to our data.
# Load the churn labels and attach them to the base table.
#churn_label = pd.read_hdf(path_to_hdf_datastore, key='churn_label')
churn_label = pd.read_pickle('../data/hdf/churn_label_dev.pickle')
# Year whose churn label is used as the prediction target.
churn_year = 'churn_2018'
# Append churn label to Customer x Product combinations
contracts_cust = pd.merge(contracts_cust, churn_label[['SIS Id (Agreement SIS)',
                                                       'Product Line Level 2', 'cust_booking_trend',
                                                       'cust_prod_booking_trend', churn_year]],
                          left_on=['SIS Id (Agreement SIS)', 'Product Line Level 2'],
                          right_on=['SIS Id (Agreement SIS)', 'Product Line Level 2'],
                          how='inner')
contracts_cust.shape
# Sanity checks: null id columns and unique key counts after the merge.
print(contracts_cust['SIS Id (Agreement SIS)'].isnull().value_counts(dropna=False))
print(contracts_cust['PARENT_ECR'].isnull().value_counts(dropna=False))
print(contracts_cust['CHILD_ECR'].isnull().value_counts(dropna=False))
print(len(pd.unique(contracts_cust[['SIS Id (Agreement SIS)', 'Product Line Level 2']].values.ravel('K'))))
print(len(pd.unique(contracts_cust[['PARENT_ECR', 'Product Line Level 2']].values.ravel('K'))))
print(len(pd.unique(contracts_cust[['CHILD_ECR', 'Product Line Level 2']].values.ravel('K'))))
print_counts(contracts_cust)
contracts_cust.head()
print(f"=========== {churn_year} ===========")
print(f' Churn figures for Customer X Product combinations in the year {churn_year}')
print(contracts_cust[churn_year].value_counts(dropna=False))
print(100. * contracts_cust[churn_year].value_counts(dropna=False) / len(contracts_cust[churn_year]))
# We ignore customers that have status LOST.
# We classify RETAINED and UPSOLD as NOT CHURNED, and
# TOTAL_CHURN / PARTIAL_CHURN as the two churned classes.
contracts_cust['CHURN_TYPE'] = contracts_cust[churn_year]
contracts_cust = contracts_cust[~contracts_cust['CHURN_TYPE'].isin(['LOST'])]
# Boolean masks for the three target classes.
is_partial_churn = contracts_cust[churn_year].isin(['PARTIAL_CHURN'])
is_total_churn = contracts_cust[churn_year].isin(['TOTAL_CHURN'])
no_churn = contracts_cust[churn_year].isin(['UPSOLD', 'RETAINED'])
is_partial_churn.value_counts()
no_churn.value_counts()
If you wanted to do binary classification, you could label the target variable as below:
# is_churn = contracts_cust[churn_year].isin(['PARTIAL_CHURN', 'TOTAL_CHURN'])
# no_churn = contracts_cust[churn_year].isin(['UPSOLD', 'RETAINED'])
# is_churn.value_counts()
# no_churn.value_counts()
# contracts_cust.loc[is_churn, 'CHURN'] = 'YES'
# contracts_cust.loc[no_churn, 'CHURN'] = 'NO'
# print(f'=========== {churn_year} ===========')
# print(contracts_cust['CHURN'].value_counts(dropna=False))
# print(100. * contracts_cust['CHURN'].value_counts(dropna=False) / len(contracts_cust['CHURN']))
We will be using multiclass classification as below
# Map the raw churn labels onto the three-class target CHURN_TYPE.
contracts_cust.loc[is_partial_churn, 'CHURN_TYPE'] = 'PARTIAL'
contracts_cust.loc[is_total_churn, 'CHURN_TYPE'] = 'TOTAL'
contracts_cust.loc[no_churn, 'CHURN_TYPE'] = 'NONE'
print(f'=========== {churn_year} ===========')
print(contracts_cust['CHURN_TYPE'].value_counts(dropna=False))
print(100. * contracts_cust['CHURN_TYPE'].value_counts(dropna=False) / len(contracts_cust['CHURN_TYPE']))
# The raw yearly label column is no longer needed once CHURN_TYPE is set.
contracts_cust = contracts_cust.drop([churn_year], axis=1)
We read ScienceDirect usage data for JES and JLS aggregated at ECR ID level. For customers with no JES or JLS subscription we will set usage to 0.
# ScienceDirect usage per customer x product; left join keeps customers
# without JES/JLS subscriptions (their usage is filled with 0 further down).
#sd_usage = pd.read_hdf(path_to_hdf_datastore, key='sd_cust_prod_usage')
sd_usage = pd.read_pickle('../data/hdf/sd_cust_prod_usage.pickle')
sd_usage.head()
sd_usage['3_yr_usg_change'].value_counts(dropna=False)
# Drop the raw per-year usage columns; only the derived trend features are kept.
contracts_cust_usage = pd.merge(contracts_cust, sd_usage.drop([2013, 2014, 2015, 2016, 2017, 2018], axis=1),
                                left_on=[MERGE_ID, 'Product Line Level 2'],
                                right_on=['ecr', 'Product Line Level 2'],
                                how='left')
contracts_cust_usage.drop(['ecr'], axis=1, inplace=True)
NAs are introduced where journals usage data is not available for a customer.
contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
# Customers without usage data get an explicit 'no_usage_data' category.
contracts_cust_usage['jnl_usage_trend'] = contracts_cust_usage['jnl_usage_trend'].fillna('no_usage_data')
# astype(str) turns NaN into the string 'nan'; convert it back so fillna works.
contracts_cust_usage['3_yr_usg_change'] = contracts_cust_usage['3_yr_usg_change'].astype(str).replace('nan', np.NaN)
contracts_cust_usage['3_yr_usg_change'] = contracts_cust_usage['3_yr_usg_change'].fillna('no_usage_data')
contracts_cust_usage['3_yr_usg_change'].value_counts(dropna=False)
# Fill nan values with zero
contracts_cust_usage = contracts_cust_usage.fillna(0)
contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
contracts_cust_usage.jnl_usage_trend.value_counts()
sd_usage.head()
#sd_usage = pd.read_hdf(path_to_hdf_datastore, key='sd_usage')
#sd_usage.head()
#contracts_cust_usage = pd.merge(contracts_cust, sd_usage, left_on=MERGE_ID, right_on='ecr', how='left')
#contracts_cust_usage = contracts_cust_usage.drop(['ecr'], axis=1)
#contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
*** Left join will introduce NaN in usage stats for customers with no usage details. we replace NaN with 0
#contracts_cust_usage = contracts_cust_usage.fillna(0)
#contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
We read the web traffic data from Adobe Analytics to add web traffic metrics to the model features.
# Adobe Analytics web-traffic metrics per customer x product.
traffic = pd.read_hdf(path_to_hdf_datastore, key=cfg['usage_file'])
traffic['PROD_NAME'].value_counts(dropna=False)
traffic.replace(NA_VALUES_LIST, np.nan, inplace=True)
# Metric columns arrive as strings; cast them to float for aggregation.
numeric_cols = ['ACT_CLICK_DEPTH', 'ACT_DWELL_TIME_VISIT_MIN',
                'LOY_DWELL_TIME_USER_MIN', 'LOY_RETURN_RATE', 'POP_ACTIVE_USERS',
                'POP_PAGE_VIEWS', 'POP_TIME_SPENT_HRS', 'POP_VISITS']
traffic[numeric_cols] = traffic[numeric_cols].astype(float)
traffic.head()
traffic.isnull().apply(lambda x: x.value_counts())
# Fill nan values with Mode
traffic = traffic.fillna(traffic.mode().iloc[0])
traffic.isnull().apply(lambda x: x.value_counts())
# Normalize the report date and derive year/month parts.
traffic['REPORT_DT'] = pd.to_datetime(traffic['REPORT_DT']).dt.strftime('%Y-%m-%d')
traffic['REPORT_DT'] = pd.to_datetime(traffic['REPORT_DT'], format='%Y-%m-%d')
traffic['REPORT_YEAR'] = traffic['REPORT_DT'].map(lambda x: x.year)
traffic['REPORT_MONTH'] = traffic['REPORT_DT'].map(lambda x: x.month)
traffic.head()
# Mean traffic metrics per customer, product and year.
traffic_agg = traffic.groupby(['ECR_ID', 'PROD_NAME', 'REPORT_YEAR']).agg(
    mean_visits=('POP_VISITS', 'mean'),
    mean_time_spent=('POP_TIME_SPENT_HRS', 'mean'),
    mean_page_views=('POP_PAGE_VIEWS', 'mean'),
    mean_active_users=('POP_ACTIVE_USERS', 'mean')
)
traffic_agg
# Pivot each metric so years become columns (one wide frame per metric).
# NOTE(review): the first pivot is fed the indexed frame while the others
# get .reset_index() -- presumably equivalent here; confirm.
traffic_agg_visits = pd.pivot_table(traffic_agg,
                                    index=['ECR_ID', 'PROD_NAME'],
                                    columns=['REPORT_YEAR'],
                                    values=['mean_visits'],
                                    aggfunc=sum, fill_value=0, margins=False).reset_index()
traffic_agg_time = pd.pivot_table(traffic_agg.reset_index(),
                                  index=['ECR_ID', 'PROD_NAME'],
                                  columns=['REPORT_YEAR'],
                                  values=['mean_time_spent'],
                                  aggfunc=sum, fill_value=0, margins=False).reset_index()
traffic_agg_pageviews = pd.pivot_table(traffic_agg.reset_index(),
                                       index=['ECR_ID', 'PROD_NAME'],
                                       columns=['REPORT_YEAR'],
                                       values=['mean_page_views'],
                                       aggfunc=sum, fill_value=0, margins=False).reset_index()
traffic_agg_users = pd.pivot_table(traffic_agg.reset_index(),
                                   index=['ECR_ID', 'PROD_NAME'],
                                   columns=['REPORT_YEAR'],
                                   values=['mean_active_users'],
                                   aggfunc=sum, fill_value=0, margins=False).reset_index()
# Flatten the (metric, year) MultiIndex columns to plain year labels.
traffic_agg_visits.columns = pd.Index([(e[0] if not e[1] else e[1]) for e in traffic_agg_visits.columns.tolist()])
traffic_agg_time.columns = pd.Index([(e[0] if not e[1] else e[1]) for e in traffic_agg_time.columns.tolist()])
traffic_agg_pageviews.columns = pd.Index([(e[0] if not e[1] else e[1]) for e in traffic_agg_pageviews.columns.tolist()])
traffic_agg_users.columns = pd.Index([(e[0] if not e[1] else e[1]) for e in traffic_agg_users.columns.tolist()])
traffic_agg_visits
# Categorical trend feature (increasing / decreasing / up-down) per metric.
traffic_agg_visits = get_trend_feature(traffic_agg_visits, 'visits_trend',
                                       groupcols=['ECR_ID', 'PROD_NAME'],
                                       timecols=[2016, 2017, 2018, 2019],
                                       prefix='visits_'
                                       )
traffic_agg_time = get_trend_feature(traffic_agg_time, 'time_trend',
                                     groupcols=['ECR_ID', 'PROD_NAME'],
                                     timecols=[2016, 2017, 2018, 2019],
                                     prefix='time_'
                                     )
traffic_agg_pageviews = get_trend_feature(traffic_agg_pageviews, 'pageviews_trend',
                                          groupcols=['ECR_ID', 'PROD_NAME'],
                                          timecols=[2016, 2017, 2018, 2019],
                                          prefix='pageviews_'
                                          )
traffic_agg_users = get_trend_feature(traffic_agg_users, 'user_trend',
                                      groupcols=['ECR_ID', 'PROD_NAME'],
                                      timecols=[2016, 2017, 2018, 2019],
                                      prefix='users_'
                                      )
print('TRAFFIC VISITS TREND')
print(traffic_agg_visits['visits_trend'].value_counts(dropna=False))
print('TRAFFIC TIME SPENT TREND')
print(traffic_agg_time['time_trend'].value_counts(dropna=False))
print('TRAFFIC PAGE VIEWS TREND')
print(traffic_agg_pageviews['pageviews_trend'].value_counts(dropna=False))
print('TRAFFIC USER TREND')
print(traffic_agg_users['user_trend'].value_counts(dropna=False))
def pct_change(df, period1, period2):
    """Percent change of a row's value from *period1* to *period2*.

    Special cases: returns 0 when the new period is 0, and returns the
    raw new value when the old period is 0 (no baseline to divide by).
    """
    old_val = df[period1]
    new_val = df[period2]
    if new_val == 0:
        return 0
    if old_val == 0:
        return new_val
    return (new_val - old_val) * 100 / old_val
# 3-year percent change (2017 -> 2019) for each traffic metric.
traffic_agg_visits['visits_3_year_change'] = traffic_agg_visits.apply(lambda x: pct_change(x, 2017, 2019), axis=1)
traffic_agg_time['time_3_year_change'] = traffic_agg_time.apply(lambda x: pct_change(x, 2017, 2019), axis=1)
traffic_agg_pageviews['pageviews_3_year_change'] = traffic_agg_pageviews.apply(lambda x: pct_change(x, 2017, 2019), axis=1)
traffic_agg_users['users_3_year_change'] = traffic_agg_users.apply(lambda x: pct_change(x, 2017, 2019), axis=1)
traffic_agg_visits
traffic_agg_time
# Combine the four per-metric trend frames into one; the raw per-year
# columns are dropped, only the derived trend features are kept.
traffic_trends = pd.merge(traffic_agg_visits.drop([2016, 2017, 2018, 2019], axis=1),
                          traffic_agg_time.drop([2016, 2017, 2018, 2019], axis=1),
                          on=['ECR_ID', 'PROD_NAME']
                          )
traffic_trends = pd.merge(traffic_trends,
                          traffic_agg_pageviews.drop([2016, 2017, 2018, 2019], axis=1),
                          on=['ECR_ID', 'PROD_NAME']
                          )
traffic_trends = pd.merge(traffic_trends,
                          traffic_agg_users.drop([2016, 2017, 2018, 2019], axis=1),
                          on=['ECR_ID', 'PROD_NAME']
                          )
traffic_trends
# Left join keeps customer x product rows with no traffic data (NaN for now).
contracts_cust_usage = pd.merge(contracts_cust_usage, traffic_trends,
                                left_on=['SIS Id (Agreement SIS)', 'Product Line Level 2'],
                                right_on=['ECR_ID', 'PROD_NAME'],
                                how='left'
                                )
contracts_cust_usage.drop(['ECR_ID', 'PROD_NAME'], axis=1, inplace=True)
*** Left join will introduce NaN in usage stats for customers with no usage details. we replace NaN with 0
contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
# Rows without traffic data get an explicit 'no_traffic_data' category;
# remaining numeric NaNs are filled with 0.
contracts_cust_usage['visits_trend'] = contracts_cust_usage['visits_trend'].fillna('no_traffic_data')
contracts_cust_usage['time_trend'] = contracts_cust_usage['time_trend'].fillna('no_traffic_data')
contracts_cust_usage['pageviews_trend'] = contracts_cust_usage['pageviews_trend'].fillna('no_traffic_data')
contracts_cust_usage['user_trend'] = contracts_cust_usage['user_trend'].fillna('no_traffic_data')
contracts_cust_usage = contracts_cust_usage.fillna(0)
contracts_cust_usage.isnull().apply(lambda x: x.value_counts())
contracts_cust_usage['visits_trend'].value_counts()
contracts_cust_usage.head()
Role mapping of the assigned person's distribution over time.
# Read churn-activities data (tasks logged against at-risk customers).
churn_activities = pd.read_hdf(path_to_hdf_datastore, cfg['churn_activities_hdf_file'])
churn_activities.head()
churn_activities['Product Name'].value_counts(dropna=True)
*** Ignore the product column as it's empty. We aggregate the churn activities data at customer level.
# Count activities per customer, activity type, assignee and role.
churn_activities_agg = churn_activities.groupby(
    [churn_activities_ecrid_col, 'Activity Type', 'Assigned', 'Assigned Role']
).agg(num_activities=("Task", 'count')
).sort_values('num_activities', ascending=False).reset_index()
churn_activities_agg = churn_activities_agg.replace('nan', np.NaN)
churn_activities_agg[churn_activities_ecrid_col].value_counts(dropna=False)
# Rows without a customer id cannot be joined to the base table -- drop them.
churn_activities_agg = churn_activities_agg[~churn_activities_agg[churn_activities_ecrid_col].isnull()]
churn_activities_agg.head()
** Impute Activity Type
churn_activities_agg['Activity Type'].value_counts(dropna=False)
# Impute missing Activity Type proportionally to an assumed channel mix.
# NOTE(review): the weights are hard-coded assumptions -- confirm against
# the observed distribution.
import random
nans = churn_activities_agg['Activity Type'].isna()
length = sum(nans)
replacement = random.choices(['Virtual', 'Face to Face', 'Phone', 'Online'], weights=[.4, .3, .2, .1], k=length)
churn_activities_agg.loc[nans, 'Activity Type'] = replacement
churn_activities_agg['Activity Type'].value_counts(dropna=False)
# impute column with most frequent values
churn_activities_agg.isnull().apply(lambda x: x.value_counts())
churn_activities_agg = churn_activities_agg.fillna(churn_activities_agg.mode().iloc[0])
churn_activities_agg.isnull().apply(lambda x: x.value_counts())
churn_activities_agg.apply(lambda x: len(x.unique()))
# Pivot to one row per customer with activity counts per Activity Type;
# flatten the resulting MultiIndex column names with '_' joins.
churn_activities_agg = churn_activities_agg.groupby(['ECR Id', 'Activity Type']).sum().unstack(fill_value=0).reset_index()
churn_activities_agg.columns = ['_'.join(col).strip() for col in churn_activities_agg.columns.values]
churn_activities_agg.head()
# Left-join activity counts onto the base table.
contracts_cust_usage_activ = pd.merge(contracts_cust_usage, churn_activities_agg,
                                      left_on=MERGE_ID, right_on='ECR Id_',
                                      how='left')
contracts_cust_usage_activ.drop('ECR Id_', axis=1, inplace=True)
Check whether nulls were introduced in any column.
contracts_cust_usage_activ.isnull().apply(lambda x: x.value_counts())
# NOTE(review): NaNs from the left join are filled with the activity table's
# modes, not zeros -- customers with no activities get the most common counts.
contracts_cust_usage_activ = contracts_cust_usage_activ.fillna(churn_activities_agg.mode().iloc[0])
contracts_cust_usage_activ.isnull().apply(lambda x: x.value_counts())
# Churn risks raised per customer, counted by severity.
#churn_risks = pd.read_hdf(path_to_hdf_datastore, cfg['churn_risks_hdf_file'])
churn_risks = pd.read_hdf(path_to_hdf_datastore, 'churn_risks_V02')
churn_risks_agg = churn_risks.groupby(
    [churn_risks_ecrid_col, 'Severity']
).agg(num_risks=('Opportunity ID', 'count')
).sort_values('num_risks', ascending=False).reset_index()
churn_risks_agg = churn_risks_agg.replace(NA_VALUES_LIST, np.NaN)
# Free the raw risks frame.
del churn_risks
churn_risks_agg['Severity'].value_counts(dropna=False)
### Impute Severity proportionally
import random
nans = churn_risks_agg['Severity'].isna()
length = sum(nans)
replacement = random.choices(['Low', 'High', 'Medium', 'Critical'], weights=[.4, .3, .2, .1], k=length)
churn_risks_agg.loc[nans, 'Severity'] = replacement
churn_risks_agg['Severity'].value_counts(dropna=False)
# Pivot to one row per customer with risk counts per severity level.
churn_risks_agg = churn_risks_agg.groupby(['Account Name: ECR Id', 'Severity']).sum().unstack(fill_value=0).reset_index()
churn_risks_agg.columns = ['_'.join(col).strip() for col in churn_risks_agg.columns.values]
churn_risks_agg
Merge with basetable
# Left-join risk counts onto the base table.
contracts_cust_usage_activ_risks = pd.merge(contracts_cust_usage_activ, churn_risks_agg,
                                            left_on=MERGE_ID, right_on='Account Name: ECR Id_',
                                            how='left')
contracts_cust_usage_activ_risks.drop('Account Name: ECR Id_', axis=1, inplace=True)
contracts_cust_usage_activ_risks.head()
contracts_cust_usage_activ_risks.isnull().apply(lambda x: x.value_counts())
# Fill join-induced NaNs with the base table's column modes.
contracts_cust_usage_activ_risks = contracts_cust_usage_activ_risks.fillna(contracts_cust_usage_activ_risks.mode().iloc[0])
contracts_cust_usage_activ_risks.isnull().apply(lambda x: x.value_counts())
# Support interactions (incidents) aggregated per customer.
interactions = pd.read_hdf(path_to_hdf_datastore, cfg['interactions_file'])
interactions = interactions.replace('nan', np.NaN)
# Duration columns arrive as strings; cast to float before aggregating.
interactions['CREATED_TO_CLOSED_DAYS'] = interactions['CREATED_TO_CLOSED_DAYS'].astype(float)
interactions['CREATED_TO_INITIAL_RESPONSE_DAYS'] = interactions['CREATED_TO_INITIAL_RESPONSE_DAYS'].astype(float)
interactions_agg = interactions.groupby(
    [interactions_ecrid_col]
).agg(num_incidents=('INCIDENT_ID', 'count'),
      mean_days_to_close=('CREATED_TO_CLOSED_DAYS', 'mean'),
      max_days_to_close=('CREATED_TO_CLOSED_DAYS', 'max'),
      max_days_to_initial_response=('CREATED_TO_INITIAL_RESPONSE_DAYS', 'max'),
      mean_days_to_initial_response=('CREATED_TO_INITIAL_RESPONSE_DAYS', 'mean'),
      num_owners=('OWNER_ID', pd.Series.nunique)
      ).sort_values('num_incidents', ascending=False).reset_index()
interactions_agg.head()
contracts_cust_usage_activ_risks_interaction = pd.merge(contracts_cust_usage_activ_risks, interactions_agg,
                                                        left_on=MERGE_ID, right_on=interactions_ecrid_col,
                                                        how='left')
contracts_cust_usage_activ_risks_interaction.drop(interactions_ecrid_col, axis=1, inplace=True)
contracts_cust_usage_activ_risks_interaction.isnull().apply(lambda x: x.value_counts())
contracts_cust_usage_activ_risks_interaction = contracts_cust_usage_activ_risks_interaction.fillna(
    contracts_cust_usage_activ_risks_interaction.mode().iloc[0])
contracts_cust_usage_activ_risks_interaction.isnull().apply(lambda x: x.value_counts())
# NPS survey scores aggregated per customer.
nps = pd.read_hdf(path_to_hdf_datastore, cfg['nps_file'])
nps.isnull().apply(lambda x: x.value_counts())
nps = nps.fillna(nps.mode().iloc[0])
nps.isnull().apply(lambda x: x.value_counts())
nps_agg = nps.groupby(
    [NPS_ecrid_col]
).agg(num_nps=('NPS_SCORE', 'count'),
      mean_nps=('NPS_SCORE', 'mean'),
      min_nps=('NPS_SCORE', 'min'),
      max_nps=('NPS_SCORE', 'max')
      ).sort_values('mean_nps', ascending=False).astype(int).reset_index()
get_data_frame_summary(nps_agg)
contracts_cust_usage_activ_risks_interaction_nps = pd.merge(contracts_cust_usage_activ_risks_interaction, nps_agg,
                                                            left_on=MERGE_ID, right_on=NPS_ecrid_col,
                                                            how='left')
contracts_cust_usage_activ_risks_interaction_nps.drop(NPS_ecrid_col, axis=1, inplace=True)
contracts_cust_usage_activ_risks_interaction_nps.isnull().apply(lambda x: x.value_counts())
contracts_cust_usage_activ_risks_interaction_nps = contracts_cust_usage_activ_risks_interaction_nps.fillna(
    contracts_cust_usage_activ_risks_interaction_nps.mode().iloc[0])
contracts_cust_usage_activ_risks_interaction_nps.isnull().apply(lambda x: x.value_counts())
# Account-to-sales-team assignment data.
accounts_assignment = pd.read_hdf(path_to_hdf_datastore, cfg['account_assignment_file'])
# Replace NA Value with np.nan
accounts_assignment.replace(NA_VALUES_LIST, np.nan, inplace=True)
get_data_frame_summary(accounts_assignment)
AGENT
AM – Account Manager
CC – Customer Consultant
CCSD – Customer Consultant Science Direct
CMD – Customer Marketing Director
CMM – Customer Marketing Manager
RAD – Regional Account Director
RM – Regional Manager
RSSD – Research Solutions Sales Director
SD – Sales Director
SSM - Solutions Sales Manager
SSMCC – Solutions Sales Manager & Customer Consultant (double role)
# ROLE is the prefix of TERRITORY (e.g. 'AM-...' -> Account Manager,
# see the role legend above).
accounts_assignment['ROLE'] = accounts_assignment['TERRITORY'].apply(lambda x: x.split("-")[0])
# One row per ECR with the count of distinct territory owners per role.
accounts_role = accounts_assignment[['ECRID', 'ROLE',
                                     'TERRITORY_OWNER']].drop_duplicates().groupby(
    ['ECRID', 'ROLE']).count().unstack(fill_value=0).reset_index()
accounts_role
# Flatten the MultiIndex column labels by concatenating the two levels.
accounts_role.columns = pd.Index([e[0] + e[1] for e in accounts_role.columns.tolist()])
accounts_role
accounts_assignment.groupby('SIZE').agg(ECRS=('ECRID', pd.Series.nunique)).sort_values(by='ECRS', ascending=False)
accounts_assignment['SIZE'].value_counts(dropna=False)
# Impute SIZE proportionally
# NOTE(review): weights are hard-coded assumptions -- confirm against
# the observed SIZE distribution.
import random
nans = accounts_assignment['SIZE'].isna()
length = sum(nans)
replacement = random.choices(['Medium (=Direct)', 'Small (=Telesales)',
                              'Large (=Key Accounts)', 'Very Small (=Agents)'], weights=[.5, .45, .025, .025], k=length)
accounts_assignment.loc[nans, 'SIZE'] = replacement
accounts_assignment['SIZE'].value_counts(dropna=False)
TIER
The higher the tier (T1-RF being the highest) the more research-intensive the account is regarded to be, probably based on some analysis of how many scientific publications they generate on a yearly basis
# There are a high number of NaNs in the Tier column; we redistribute NaNs proportionally
accounts_assignment['TIER'].value_counts(dropna=False, normalize=True)
accounts_assignment.groupby('TIER').agg(ECRS = ('ECRID', pd.Series.nunique)).sort_values(by='ECRS', ascending=False)
nans = accounts_assignment['TIER'].isna()
length = sum(nans)
# NOTE(review): 'T2 - RE' appears twice in the choices with separate weights
# (.05 each) — the fourth entry was probably meant to be a different tier
# (e.g. T3); as written T2 effectively gets weight .10. Confirm against the
# TIER value_counts above.
replacement = random.choices(['T4 - EF', 'T1 - RF',
'T2 - RE','T2 - RE'], weights=[.8, .1, .05, .05], k=length)
accounts_assignment.loc[nans,'TIER'] = replacement
accounts_assignment['TIER'].value_counts(dropna=False, normalize=True)
* ECR to Territory Owner have a many to many mapping and cannot be mapped one to one to the base table as we have one row per ECR X Product combination. we will map count of territory owners
# Exploratory counts: distinct territory owners / tiers / sizes per account.
accounts_assignment.groupby(['ECRID', 'LEVEL_15']).agg(
sales_reps = ('TERRITORY_OWNER', pd.Series.nunique)).sort_values(by='sales_reps', ascending=False)
accounts_assignment.groupby(['ECRID']).agg(
sales_reps = ('TERRITORY_OWNER', pd.Series.nunique)).sort_values(by='sales_reps', ascending=False)
accounts_assignment.groupby('TERRITORY_OWNER').agg(sales_reps = ('ECRID', pd.Series.nunique))
accounts_assignment.groupby(['ECRID']).agg(
tiers = ('TIER', pd.Series.nunique)).sort_values(by='tiers', ascending=False)
accounts_assignment.groupby(['ECRID']).agg(
sizes = ('SIZE', pd.Series.nunique)).sort_values(by='sizes', ascending=False)
# We need to drop some duplicates and keep one row per ECRID x TERRITORY_OWNER Combination
# NOTE(review): keep='first' on ECRID alone keeps an arbitrary SIZE/TIER when
# an account has several — acceptable only if those are near-unique per ECRID
# (checked in the groupbys above).
accounts_assignment_unique = accounts_assignment[['ECRID', 'SIZE', 'TIER']].drop_duplicates(['ECRID'], keep='first')
accounts_assignment_unique.head()
# One row per ECRID with the number of distinct territory owners.
accounts_assignment_owners = accounts_assignment.groupby(['ECRID']).agg(
num_owners = ('TERRITORY_OWNER', pd.Series.nunique)).sort_values(by='num_owners', ascending=False).reset_index()
accounts_assignment_agg = pd.merge(accounts_assignment_unique, accounts_assignment_owners, on='ECRID', how='inner')
accounts_assignment_agg.head()
# Add the per-role owner counts built earlier.
accounts_assignment_agg = pd.merge(accounts_assignment_agg, accounts_role, on='ECRID', how='inner')
accounts_assignment_agg
accounts_assignment_agg.isnull().apply(lambda x: x.value_counts())
# Left-join assignment features onto the base table; unmatched rows get NaNs.
contracts_cust_usage_activ_risks_interaction_nps_acc_assign = pd.merge(
contracts_cust_usage_activ_risks_interaction_nps, accounts_assignment_agg,
left_on=MERGE_ID, right_on='ECRID',
how='left')
# Drop the redundant right-side join key.
contracts_cust_usage_activ_risks_interaction_nps_acc_assign.drop('ECRID', axis=1, inplace=True)
contracts_cust_usage_activ_risks_interaction_nps_acc_assign.isnull().apply(lambda x: x.value_counts())
# impute nans with mode (first modal value per column)
contracts_cust_usage_activ_risks_interaction_nps_acc_assign = contracts_cust_usage_activ_risks_interaction_nps_acc_assign.fillna(
contracts_cust_usage_activ_risks_interaction_nps_acc_assign.mode().iloc[0])
contracts_cust_usage_activ_risks_interaction_nps_acc_assign.isnull().apply(lambda x: x.value_counts())
# R&D expenditure indicators per country (GERD = gross domestic expenditure
# on R&D; BERD/HERD/GOVERD are business/higher-ed/government splits).
keep_columns = ['Country', 'Indicator', '2015','2016','2017','2018', '2019']
gerd = pd.read_csv('../data/02. Data collection/05. Other/GERD.csv')
berd = pd.read_csv('../data/02. Data collection/05. Other/BERD.csv')
herd = pd.read_csv('../data/02. Data collection/05. Other/HERD.csv')
goverd = pd.read_csv('../data/02. Data collection/05. Other/GOVERD.csv')
# Split the file: non-GERD rows hold the year-on-year change indicator.
gerd_trend = gerd.loc[gerd.Indicator != 'GERD', keep_columns]
gerd = gerd.loc[gerd.Indicator == 'GERD', keep_columns]
# get_trend_feature is defined elsewhere in this project; it derives a
# categorical trend label from the yearly columns.
gerd = get_trend_feature(gerd, new_colname='gerd_trend', groupcols=['Country'] ,timecols= ['2015','2016','2017','2018', '2019'])
gerd_trend = gerd_trend[['Country', '2018']]
gerd_trend.columns = ['Country', 'gerd_yoy_change']
gerd_trend
# Keep the 2018 level (churn year) plus the derived trend.
gerd = gerd[['Country', '2018', 'gerd_trend']]
gerd.columns = ['Country', 'gerd_churn_year', 'gerd_trend']
gerd = pd.merge(gerd, gerd_trend, on='Country')
# Join onto the base table by the account's country.
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd = pd.merge(contracts_cust_usage_activ_risks_interaction_nps_acc_assign,
gerd, left_on='COUNTRY_CHILD', right_on='Country',
how='left')
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd.drop(['Country'], axis=1, inplace=True)
# NAs introduced for countries absent from the GERD file
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd.isnull().apply(lambda x: x.value_counts())
# impute with mode
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd = contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd.fillna(
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd.mode().iloc[0])
contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd.isnull().apply(lambda x: x.value_counts())
# Final assembled base table; keep one short alias for the rest of the script.
basetable_display = contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd
# Delete intermediate tables to free memory
del contracts_cust_usage_activ, contracts_cust_usage_activ_risks, contracts_cust_usage_activ_risks_interaction
del contracts_cust_usage_activ_risks_interaction_nps, contracts_cust_usage_activ_risks_interaction_nps_acc_assign
del contracts_cust_usage_activ_risks_interaction_nps_acc_assign_gerd
print_counts(basetable_display)
basetable_display
# Replace the boolean flag with readable labels for display tables.
basetable_display['over_million_year'].replace({True: 'over_million_per_year', False: 'below_million_per_year'}, inplace=True)
basetable_display.to_pickle('../data/hdf/basetable_display.pickle')
We drop outliers for the booking value to ensure that they don't skew our analysis.
basetable_display = drop_outliers(basetable_display, include=['bookings'])
The Country column has too many values with a long tail, so we apply binning to reduce the number of levels: lower-frequency countries are grouped into an "Other" category.
basetable_display['COUNTRY_CHILD'].value_counts()
columns_to_bin = ['COUNTRY_CHILD']
LESS_THAN_FREQ = 100
# Bin low-frequency country labels (< 100 rows) into 'Other' to shorten the tail.
basetable_display = basetable_display.apply(lambda x: x.mask(x.map(x.value_counts())<LESS_THAN_FREQ, 'Other') if x.name in columns_to_bin else x)
# Split into journals vs everything else (solutions); each gets its own model.
is_journal = basetable_display.TYPE.isin(['JOURNALS'])
is_journal.value_counts(dropna=False)
basetable_jnl_display = basetable_display[is_journal]
basetable_sln_display = basetable_display[~is_journal]
basetable_jnl_display.TYPE.value_counts()
basetable_jnl_display.to_pickle('../data/hdf/basetable_jnl_display.pickle')
basetable_sln_display.to_pickle('../data/hdf/basetable_sln_display.pickle')
# Identifier / leakage columns not used as model features.
cols_to_drop = ['CHILD_ECR', 'PARENT_ECR', 'CHILD_NAME', 'PARENT_NAME', 'COUNTRY_PARENT', 'TYPE']
basetable_jnl_display = basetable_jnl_display.drop(cols_to_drop, axis=1)
basetable_sln_display = basetable_sln_display.drop(cols_to_drop, axis=1)
print(basetable_jnl_display['CHURN_TYPE'].value_counts(dropna=False))
print(basetable_sln_display['CHURN_TYPE'].value_counts(dropna=False))
basetable_jnl_display.select_dtypes(exclude=[np.number]).info()
def reformat_large_tick_values(tick_val, pos, prefix=''):
    """
    Format an axis tick value into a compact human-readable string.

    Turns large tick values (in the billions, millions and thousands) such as
    4500 into '4.5K' and also appropriately turns 4000 into '4K' (no zero
    after the decimal).

    Parameters
    ----------
    tick_val : float
        Value to format. Negative values keep their sign (-4500 -> '-4.5K');
        the original implementation left them unscaled.
    pos : object
        Tick position, required by matplotlib's FuncFormatter (unused).
    prefix : str, optional
        Prepended to the result (e.g. a currency symbol). Now applied
        consistently; the original silently dropped it for values < 1000.

    Returns
    -------
    str
        The formatted tick label.
    """
    # Compare on the magnitude so negative ticks pick the same scale as
    # positive ones.
    magnitude = abs(tick_val)
    if magnitude >= 1000000000:
        formatted = '{:}B'.format(round(tick_val / 1000000000, 1))
    elif magnitude >= 1000000:
        formatted = '{:}M'.format(round(tick_val / 1000000, 1))
    elif magnitude >= 1000:
        formatted = '{:}K'.format(round(tick_val / 1000, 1))
    else:
        # (The original had an unreachable trailing `else`; every number is
        # covered by the branches above.)
        formatted = str(round(tick_val, 1))
    # Keep 4.5M as is but change values such as 4.0M to 4M since that zero
    # after the decimal isn't needed.
    index_of_decimal = formatted.find(".")
    if index_of_decimal != -1 and formatted[index_of_decimal + 1] == "0":
        formatted = formatted[:index_of_decimal] + formatted[index_of_decimal + 2:]
    return prefix + formatted
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as tkr
# Churn-type color mapping used in the plots below.
color_dict = dict({'NONE':'green',
'PARTIAL':'orange',
'TOTAL':'red'
})
# NOTE(review): sns.catplot creates its own figure, so this plt.figure call
# likely has no effect on the catplot below.
plt.figure(figsize=(15, 20))
# Only accounts whose 3-year usage declined.
plot_df = basetable_jnl_display[basetable_jnl_display['3_yr_usg_change_pct'] < 0]
g = sns.catplot(data=plot_df,
x='3_yr_usg_change_pct', y='CHURN_TYPE', hue='CHURN_TYPE',
palette=color_dict, legend='full', order=['NONE','PARTIAL','TOTAL'])
g.set_xticklabels(rotation=30)
# Compact tick labels (4500 -> 4.5K) on the x axis.
for ax in g.axes[0]:
ax.xaxis.set_major_formatter(
tkr.FuncFormatter(reformat_large_tick_values))
basetable_jnl_display.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (Churn)',
color=colors, figsize=(5,5));
A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and / or adding more examples from the minority class (over-sampling).
Despite the advantage of balancing classes, these techniques also have their weaknesses (there is no free lunch). The simplest implementation of over-sampling is to duplicate random records from the minority class, which can cause overfitting. In under-sampling, the simplest technique involves removing random records from the majority class, which can cause loss of information.
We implement a basic example, which uses the DataFrame.sample method to get random samples each class:
# Class count
# NOTE(review): this unpacking assumes value_counts() returns the classes in
# the order NONE, PARTIAL, TOTAL (i.e. NONE is most frequent) — verify.
count_class_NONE, count_class_PARTIAL, count_class_TOTAL = basetable_jnl_display.CHURN_TYPE.value_counts()
# Divide by class
df_class_NONE = basetable_jnl_display[basetable_jnl_display['CHURN_TYPE'] == 'NONE']
df_class_PARTIAL = basetable_jnl_display[basetable_jnl_display['CHURN_TYPE'] == 'PARTIAL']
df_class_TOTAL = basetable_jnl_display[basetable_jnl_display['CHURN_TYPE'] == 'TOTAL']
# Under-sampling: shrink NONE and PARTIAL down to the TOTAL class size.
# NOTE(review): .sample is unseeded (no random_state), so runs are not
# reproducible.
df_class_NONE_under = df_class_NONE.sample(count_class_TOTAL)
df_class_PARTIAL_under = df_class_PARTIAL.sample(count_class_TOTAL)
df_test_under = pd.concat([df_class_NONE_under, df_class_PARTIAL_under, df_class_TOTAL], axis=0)
print('Random under-sampling:')
print(df_test_under.CHURN_TYPE.value_counts())
df_test_under.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (label)',
color=colors, figsize=(5,5));
# Over-sampling: duplicate PARTIAL and TOTAL rows up to the NONE class size.
df_class_PARTIAL_over = df_class_PARTIAL.sample(count_class_NONE, replace=True)
df_class_TOTAL_over = df_class_TOTAL.sample(count_class_NONE, replace=True)
df_test_over = pd.concat([df_class_NONE, df_class_PARTIAL_over, df_class_TOTAL_over], axis=0)
print('Random over-sampling:')
print(df_test_over.CHURN_TYPE.value_counts())
df_test_over.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (label)',
color=colors, figsize=(5,5));
# We work with random oversampling as this gives us more samples to train data
modeltable_jnl_display = df_test_over
modeltable_jnl_display = modeltable_jnl_display.set_index(['SIS Id (Agreement SIS)', 'Product Line Level 2'])
We keep a dataframe for display as this helps to have a easily explainable features, at the same time for model development we want to have numeric variables so we encode categorical varibles for model development.
# encode_labels is a project helper (defined elsewhere): label-encodes the
# listed categorical columns and returns the encoded frame plus a dict of
# fitted LabelEncoders for inverse transforms.
modeltable, le_dict = encode_labels(modeltable_jnl_display.copy(), ['over_million_year', 'Classification',
'CONSORTIUM', 'HIERARCHY_TYPE',
'cust_booking_trend', 'cust_prod_booking_trend',
'CHURN_TYPE','COUNTRY_CHILD',
'jnl_usage_trend', '3_yr_usg_change',
'visits_trend', 'time_trend',
'pageviews_trend','user_trend',
'SIZE', 'TIER','gerd_trend'
])
Here we can see that our target variable CHURN_TYPE is encoded as below:
0: NONE
1: PARTIAL
2: TOTAL
# Show the label mapping of the target (0=NONE, 1=PARTIAL, 2=TOTAL).
le_dict['CHURN_TYPE'].inverse_transform([0,1,2])
pd.set_option("display.max_rows", 90)
get_data_frame_summary(modeltable_jnl_display)
get_data_frame_summary(modeltable)
# All columns except the target are candidate features.
feature_names = np.setdiff1d(modeltable.columns, ['CHURN_TYPE'])
print(len(feature_names))
feature_names
modeltable[feature_names].describe()
# `rows` is a leftover 50% random mask; the sampling lines are commented out
# and the full table is used instead.
rows = np.random.randint(2, size=len(modeltable)).astype('bool')
#sample = modeltable[rows]
#sample_display = modeltable_jnl_display[rows]
sample = modeltable
sample_display = modeltable_jnl_display
sample.shape
sample_display.shape
sample['CHURN_TYPE'].value_counts()
Here we will first plot the Pearson correlation heatmap and see the correlation of independent variables with the output variable CHURN.
We will only select features that have correlation of above 0.5 (taking absolute value) with the output variable.
The correlation coefficient has values between -1 to 1
— A value closer to 0 implies weaker correlation (exact 0 implying no correlation)
— A value closer to 1 implies stronger positive correlation
— A value closer to -1 implies stronger negative correlation
#Using Pearson Correlation
plt.figure(figsize=(30,20))
cor = sample.corr()
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds)
plt.show()
# As these are Journals there is no webtraffic data and hence you see white gaps
sample = sample.drop(['user_trend','users_3_year_change',
'visits_trend','visits_3_year_change',
'time_trend','time_3_year_change',
'pageviews_trend','pageviews_3_year_change'
], axis=1)
# Recompute correlations after dropping the empty webtraffic columns.
cor = sample.corr()
#Correlation with output variable (absolute Pearson correlation)
cor_target = abs(cor["CHURN_TYPE"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target > 0.2]
relevant_features
Keep only one variable if two variable are highly correlated with each other (>0.75)
# Drop the second of any feature pair whose pairwise correlation exceeds the
# threshold, keeping the earlier column.
# NOTE(review): (1) no abs() here, so strongly NEGATIVELY correlated pairs
# are never deduplicated; (2) columns already marked False are still used as
# row `i` to eliminate others. Both are quirks of this common snippet —
# changing them would change the selected feature set.
TRESHOLD = 0.75
columns = np.full((cor.shape[0],), True, dtype=bool)
for i in range(cor.shape[0]):
for j in range(i+1, cor.shape[0]):
if cor.iloc[i,j] >= TRESHOLD:
if columns[j]:
columns[j] = False
selected_columns = sample.columns[columns]
sample = sample[selected_columns]
# Keep the display frame aligned with the surviving feature columns.
sample_display = sample_display[sample.columns]
sample.columns
#Using Pearson Correlation (after deduplication)
plt.figure(figsize=(30,20))
cor = sample.corr()
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds)
plt.show()
# Features / target split; *_display keeps human-readable category labels.
X = sample.loc[:, sample.columns != 'CHURN_TYPE']
y = sample['CHURN_TYPE']
X_display = sample_display.loc[:, sample_display.columns != 'CHURN_TYPE']
y_display = sample_display['CHURN_TYPE']
X.columns
import xgboost
import shap
import matplotlib.pylab as pl
from sklearn.model_selection import train_test_split
# Enable SHAP's JS visualizations in the notebook.
shap.initjs()
# create a train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
d_train = xgboost.DMatrix(X_train, label=y_train)
d_test = xgboost.DMatrix(X_test, label=y_test)
Check the target variable
y_test
Simple model without tuned parameters
# Baseline booster: default settings except depth and multiclass objective.
params = {
'max_depth': 6,
'objective': 'multi:softmax', # emits class labels directly for multiclass training
'num_class': 3,
'n_gpus': 0
}
bst = xgboost.train(params, d_train)
pred = bst.predict(d_test)
pd.Series(pred).value_counts()
y_test.value_counts()
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Render a confusion matrix as a heatmap.

    Cell shading comes from row-normalized values when ``normalized`` is
    True, while the annotations always show the raw counts.
    """
    plt.figure(figsize=[7, 6])
    if normalized:
        # Scale each row by its total so the shading reflects per-class recall.
        shading = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    else:
        shading = cm
    sns.heatmap(shading, annot=cm, fmt='g',
                xticklabels=classes, yticklabels=classes, cmap=cmap)
    #plt.savefig('confusion-matrix.png')
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
We further tune the model by varying the hyperparameters of our model to ensure that we have a generalized model that is not overfitting the data
learning_rate: Step-size shrinkage applied after each boosting round; lower values make the model more conservative but need more trees
max_depth: Used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample
n_estimators: The number of boosted trees to fit
reg_lambda: L2 regularization term on weights, this is used to reduce overfitting
gamma: Makes the algorithm conservative. A node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split
random_state: The random number seed.
# Tuned Model (hyperparameters chosen manually; see the notes above)
xgbc = xgboost.XGBClassifier(learning_rate=0.3,
n_estimators=120,
max_depth=3,
min_child_weight=0,
gamma=0,
reg_lambda=1,
subsample=1,
colsample_bytree=0.75,
scale_pos_weight=1,
objective='multi:softprob',
num_class=3,
random_state=42)
start_time = time.time()
model = xgbc.fit(X_train, y_train, eval_metric='mlogloss')
print(f'Model training completed in {round((time.time() - start_time)/60, 2)} mins for {len(X_train)} samples')
# Hard class predictions and per-class probabilities on the held-out set.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
# NOTE(review): re-definition — identical to the earlier plot_confusion_matrix
# (an artifact of the notebook export); could be defined once.
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
# Shade by row-normalized values; annotate with the raw counts.
plt.figure(figsize=[7, 6])
norm_cm = cm
if normalized:
norm_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
#plt.savefig('confusion-matrix.png')
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
pd.Series(pred).value_counts()
y_test.value_counts()
proba
# Highest class probability per row (the model's confidence in its pick).
proba.max(axis=1)
Feature Importance by Weights
The number of times a feature is used to split the data across all trees.
pl.rcParams['figure.figsize'] = [20, 20]
xgboost.plot_importance(model)
pl.title("xgboost.plot_importance(model)")
pl.show()
We see above that the following features are important by weights
1- prod_mean_Bookings
2- prod_bookings
3- prod_bookings_per_year
4- total_mean_bookings
5- prod_days_since_last_agreement
Feature Importance by Cover
The number of times a feature is used to split the data across all trees weighted by the number of training data points that go through those splits.
xgboost.plot_importance(model, importance_type="cover")
We see above that the following features are important by cover
1- TERRIOTRY_OWNERAGENT
2- cust_prod_booking_trend
3- visits_trend
4- visits_3_year_change
5- users_3_year_change
Feature Importance by Gain
The average training loss reduction gained when using a feature for splitting.
xgboost.plot_importance(model, importance_type="gain")
pl.title('xgboost.plot_importance(model, importance_type="gain")')
pl.show()
We see above that the following features are important by gain
# this takes a minute or two since we are explaining over 30 thousand samples in a model with over a thousand trees
explainer = shap.TreeExplainer(model)
# shap_values: one array per class, rows aligned with X.
shap_values = explainer.shap_values(X)
Visualize a single prediction Note that we use the "display values" data frame so we get nice strings instead of category codes.
for which_class in range(0,3):
display(shap.force_plot(explainer.expected_value[which_class], shap_values[which_class][1], X_display.iloc[1,:]))
Visualize many predictions To keep the browser happy we only visualize 1,000 individuals.
shap.force_plot(explainer.expected_value[1], shap_values[1][:1000,:], X_display.iloc[:1000,:])
Bar chart of mean importance This takes the average of the SHAP value magnitudes across the dataset and plots it as a simple bar chart.
from matplotlib import colors as plt_colors
# class names
classes = ['NONE', 'PARTIAL', 'TOTAL']
# set RGB tuple per class
colors = [(0, 0.5, 1), (1, 0.7, 0), (1, 0.5, 0)]
# get class ordering from shap values (most impactful class first)
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
# create listed colormap
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
# plot
shap.summary_plot(shap_values, X_display, color=cmap, class_names=classes)
Variables most significant for predicting class 2, i.e. Total Churn
shap.summary_plot(shap_values[2], X)
Variables most significant for predicting class 1, i.e. Partial Churn
shap.summary_plot(shap_values[1], X)
SHAP Summary Plot Rather than use a typical feature importance bar chart, we use a density scatter plot of SHAP values for each feature to identify how much impact each feature has on the model output for records (i.e. customer x product combinations) in the validation dataset. Features are sorted by the sum of the SHAP value magnitudes across all samples.
Note that when the scatter points don't fit on a line they pile up to show density, and the color of each point represents the feature value of that individual.
We keep only the most predictive variables in order to simplify the model
# Reduced feature set for the simplified (explainable) model; CHURN_TYPE is
# included so the same list can slice both frames.
keep_features = ['total_num_agrmts', 'total_days_since_last_agreement', 'cust_prod_booking_trend', 'total_bookings',
'3_yr_mean_usage', '3_yr_usg_change_pct', 'gerd_churn_year', 'CHURN_TYPE']
simple_sample = sample[keep_features]
simple_sample_display = sample_display[keep_features]
X = simple_sample.loc[:, simple_sample.columns != 'CHURN_TYPE']
y = simple_sample['CHURN_TYPE']
X_display = simple_sample_display.loc[:, simple_sample_display.columns != 'CHURN_TYPE']
y_display = simple_sample_display['CHURN_TYPE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# Same tuned configuration as before except a higher learning rate.
xgbc = xgboost.XGBClassifier(learning_rate=0.5,
n_estimators=120,
max_depth=3,
min_child_weight=0,
gamma=0,
reg_lambda=1,
subsample=1,
colsample_bytree=0.75,
scale_pos_weight=1,
objective='multi:softprob',
num_class=3,
random_state=42)
start_time = time.time()
model = xgbc.fit(X_train, y_train, eval_metric='mlogloss')
print(f'Model training completed in {round((time.time() - start_time)/60, 2)} mins for {len(X_train)} samples')
pred = model.predict(X_test)
proba = model.predict_proba(X_test)
simple_sample_display['CHURN_TYPE'].value_counts()
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
# NOTE(review): yet another identical re-definition of plot_confusion_matrix.
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
plt.figure(figsize=[7, 6])
norm_cm = cm
if normalized:
norm_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
#plt.savefig('confusion-matrix.png')
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
pd.Series(pred).value_counts()
y_test.value_counts()
# Explain the simplified model on the full feature matrix.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
X.columns
# Stack per-class SHAP arrays into one frame (classes concatenated row-wise).
shap_df = pd.DataFrame(np.concatenate(shap_values))
shap_df.columns = X.columns
shap_df
Visualize a single prediction Note that we use the "display values" data frame so we get nice strings instead of category codes.
You can visualise how every feature and its values affects the prediction how the feature is pushing it higher or lower and the size of effect
# Explain one row (index 1) for class 0 (NONE).
shap.force_plot(explainer.expected_value[0], shap_values[0][1], X_display.iloc[1,:])
Prediction explainer for each class
for which_class in range(0,3):
display(shap.force_plot(explainer.expected_value[which_class], shap_values[which_class][1], X_display.iloc[1,:]))
#Display all features and SHAP values
# df1=pd.DataFrame(data=shap_values[0][1].tolist(), columns=X.columns, index=[0])
# df2=pd.DataFrame(data=shap_values[1][1].tolist(), columns=X.columns, index=[1])
# df3=pd.DataFrame(data=shap_values[2][1].tolist(), columns=X.columns, index=[2])
# df=pd.concat([df1,df2,df3])
# display(df.transpose())
Visualize many predictions To keep the browser happy we only visualize 1,000 individuals.
# Force plot of the first 1000 rows for class 2 (TOTAL churn).
shap.force_plot(explainer.expected_value[2], shap_values[2][:1000,:], X_display.iloc[:1000,:])
Bar chart of mean importance This takes the average of the SHAP value magnitudes across the dataset and plots it as a simple bar chart.
from matplotlib import colors as plt_colors
# class names
classes = ['NONE', 'PARTIAL', 'TOTAL']
# set RGB tuple per class
colors = [(0, 0.5, 1), (1, 0.7, 0), (1, 0.5, 0)]
# get class ordering from shap values (most impactful class first)
class_inds = np.argsort([-np.abs(shap_values[i]).mean() for i in range(len(shap_values))])
# create listed colormap
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
# plot
shap.summary_plot(shap_values, X_display, color=cmap, class_names=classes)
Variables most significant for predicting class 2, i.e. Total Churn
shap.summary_plot(shap_values[2], X)
Variables most significant for predicting class 1, i.e. Partial Churn
shap.summary_plot(shap_values[1], X)
# Drop the target from the list (mutates keep_features in place) so the loop
# below only iterates over predictors.
keep_features.remove('CHURN_TYPE')
for this_feature in keep_features:
# print(f' SHAP partial dependence plot for {this_feature} on NO_CHURN')
# shap.dependence_plot(this_feature, shap_values[0], X, display_features=X_display)
print(f' SHAP partial dependence plot for {this_feature} on TOTAL_CHURN')
shap.dependence_plot(this_feature, shap_values[2], X, display_features=X_display)
print(f' SHAP partial dependence plot for {this_feature} on PARTIAL_CHURN')
shap.dependence_plot(this_feature, shap_values[1], X, display_features=X_display)
# NOTE(review): the commented-out block below is abandoned LIME scaffolding
# (its first line `def wrapped_fn(one+):` is not even valid syntax) — safe to
# delete in a future cleanup.
# def wrapped_fn(one+):
# print(example.shape)
# df = pd.DataFrame(data=example.reshape(1,-1),
# columns=X_test.columns)
# pp = model.predict_proba(df)
# print(pp)
# return np.hstack(pp)
# pp = model.predict_proba(X_test.iloc[[1]])
# print(pp)
# np.hstack(pp)
# xgb_prediction(X_test.iloc[1,:])
# explainer_lime = LimeTabularExplainer(X_train.to_numpy(),
# feature_names=X_train.columns.tolist(),
# categorical_features=X_train.cust_prod_booking_trend,
# categorical_names=['cust_prod_booking_trend'],
# class_names=[0,1,2], mode='classification',
# training_labels=y_train)
# Serialise and save the journals model
pickle.dump(model, open("../models/journals_xgb.pickle.dat", "wb"))
# ---- Solutions model: same resampling recipe as the journals model above ----
basetable_sln_display.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (Churn)',
color=colors, figsize=(5,5));
A widely adopted technique for dealing with highly unbalanced datasets is called resampling. It consists of removing samples from the majority class (under-sampling) and / or adding more examples from the minority class (over-sampling).
Despite the advantage of balancing classes, these techniques also have their weaknesses (there is no free lunch). The simplest implementation of over-sampling is to duplicate random records from the minority class, which can cause overfitting. In under-sampling, the simplest technique involves removing random records from the majority class, which can cause loss of information.
We implement a basic example, which uses the DataFrame.sample method to get random samples each class:
# Class count
# NOTE(review): assumes value_counts() order is NONE, PARTIAL, TOTAL.
count_class_NONE, count_class_PARTIAL, count_class_TOTAL = basetable_sln_display.CHURN_TYPE.value_counts()
# Divide by class
df_class_NONE = basetable_sln_display[basetable_sln_display['CHURN_TYPE'] == 'NONE']
df_class_PARTIAL = basetable_sln_display[basetable_sln_display['CHURN_TYPE'] == 'PARTIAL']
df_class_TOTAL = basetable_sln_display[basetable_sln_display['CHURN_TYPE'] == 'TOTAL']
# Under-sampling down to the TOTAL class size (unseeded).
df_class_NONE_under = df_class_NONE.sample(count_class_TOTAL)
df_class_PARTIAL_under = df_class_PARTIAL.sample(count_class_TOTAL)
df_test_under = pd.concat([df_class_NONE_under, df_class_PARTIAL_under, df_class_TOTAL], axis=0)
print('Random under-sampling:')
print(df_test_under.CHURN_TYPE.value_counts())
df_test_under.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (label)',
color=colors, figsize=(5,5));
# Over-sampling up to the NONE class size (with replacement).
df_class_PARTIAL_over = df_class_PARTIAL.sample(count_class_NONE, replace=True)
df_class_TOTAL_over = df_class_TOTAL.sample(count_class_NONE, replace=True)
df_test_over = pd.concat([df_class_NONE, df_class_PARTIAL_over, df_class_TOTAL_over], axis=0)
print('Random over-sampling:')
print(df_test_over.CHURN_TYPE.value_counts())
df_test_over.CHURN_TYPE.value_counts().plot(kind='bar', title='Count (label)',
color=colors, figsize=(5,5));
# We work with random oversampling as this gives us more samples to train data
modeltable_sln_display = df_test_over
modeltable_sln_display = modeltable_sln_display.set_index(['SIS Id (Agreement SIS)', 'Product Line Level 2'])
modeltable, le_dict = encode_labels(modeltable_sln_display.copy(), ['over_million_year', 'Classification',
'CONSORTIUM', 'HIERARCHY_TYPE',
'cust_booking_trend', 'cust_prod_booking_trend',
'CHURN_TYPE','COUNTRY_CHILD',
'jnl_usage_trend', '3_yr_usg_change',
'visits_trend', 'time_trend',
'pageviews_trend','user_trend',
'SIZE', 'TIER', 'gerd_trend'
])
# basetable = encode_columns(basetable, ['over_million_year', 'CONSORTIUM', 'HIERARCHY_TYPE'
# 'Classification', 'cust_trend', 'prod_trend',
# 'SIZE', 'TIER',
# 'jnl_usage_trend', 'visits_trend', 'time_trend', 'pageviews_trend','user_trend'
# ])
pd.set_option("display.max_rows", 90)
get_data_frame_summary(modeltable_sln_display)
get_data_frame_summary(modeltable)
# feature_names = ['bookings', 'num_contracts', 'num_parents', 'days_since_last_agreement', 'length_of_relationship',
# 'mean_visits', 'mean_time_spent', 'mean_page_views','mean_active_users']
feature_names = np.setdiff1d(modeltable.columns, ['CHURN_TYPE'])
print(len(feature_names))
feature_names
modeltable[feature_names].describe()
modeltable.shape
rows = np.random.randint(2, size=len(modeltable)).astype('bool')
#sample = modeltable[rows]
#sample_display = modeltable_jnl_display[rows]
sample = modeltable
sample_display = modeltable_sln_display
sample.shape
sample_display.shape
sample['CHURN_TYPE'].value_counts()
Here we will first plot the Pearson correlation heatmap and see the correlation of independent variables with the output variable CHURN. We will only select features that have correlation of above 0.5 (taking absolute value) with the output variable. The correlation coefficient has values between -1 to 1 — A value closer to 0 implies weaker correlation (exact 0 implying no correlation) — A value closer to 1 implies stronger positive correlation — A value closer to -1 implies stronger negative correlation
#Using Pearson Correlation
plt.figure(figsize=(30,20))
cor = sample.corr()
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds)
plt.show()
# As these are Solutions there is no usage data and hence you see white gaps
sample = sample.drop(['jnl_usage_trend','3_yr_mean_usage',
'3_yr_usg_change_pct','3_yr_usg_change'
], axis=1)
# Recompute correlations after dropping the empty usage columns.
cor = sample.corr()
#Correlation with output variable (absolute Pearson correlation)
cor_target = abs(cor["CHURN_TYPE"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target > 0.2]
relevant_features
# Pairwise-correlation deduplication, same snippet (and same caveats: no
# abs(), eliminated columns still eliminate others) as the journals model.
TRESHOLD = 0.75
columns = np.full((cor.shape[0],), True, dtype=bool)
for i in range(cor.shape[0]):
for j in range(i+1, cor.shape[0]):
if cor.iloc[i,j] >= TRESHOLD:
if columns[j]:
columns[j] = False
selected_columns = sample.columns[columns]
sample = sample[selected_columns]
sample_display = sample_display[sample.columns]
#Using Pearson Correlation (after deduplication)
plt.figure(figsize=(30,20))
cor = sample.corr()
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds)
plt.show()
# Features / target split; *_display keeps human-readable category labels.
X = sample.loc[:, sample.columns != 'CHURN_TYPE']
y = sample['CHURN_TYPE']
X_display = sample_display.loc[:, sample_display.columns != 'CHURN_TYPE']
y_display = sample_display['CHURN_TYPE']
X.columns
import xgboost
import shap
import matplotlib.pylab as pl
from sklearn.model_selection import train_test_split
shap.initjs()
# create a train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
d_train = xgboost.DMatrix(X_train, label=y_train)
d_test = xgboost.DMatrix(X_test, label=y_test)
y_test
Simple model without tuned parameters
# Baseline booster: same configuration as the journals baseline.
params = {
'max_depth': 6,
'objective': 'multi:softmax', # emits class labels directly for multiclass training
'num_class': 3,
'n_gpus': 0
}
bst = xgboost.train(params, d_train)
pred = bst.predict(d_test)
pd.Series(pred).value_counts()
y_test.value_counts()
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
# NOTE(review): fourth identical re-definition of plot_confusion_matrix —
# could be defined once near the top of the script.
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
# Shade by row-normalized values; annotate with the raw counts.
plt.figure(figsize=[7, 6])
norm_cm = cm
if normalized:
norm_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
#plt.savefig('confusion-matrix.png')
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
# Tuned multiclass classifier (softprob gives per-class probabilities).
xgbc = xgboost.XGBClassifier(learning_rate=0.5,
                             n_estimators=120,
                             max_depth=3,
                             gamma=0,
                             reg_lambda=1,
                             subsample=1,
                             colsample_bytree=0.75,
                             scale_pos_weight=1,
                             objective='multi:softprob',
                             num_class=3, random_state=42)
model = xgbc.fit(X_train, y_train, eval_metric='mlogloss')
pred = model.predict(X_test)
proba = model.predict_proba(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
# BUG FIX: recompute the confusion matrix for THIS model.  Previously the
# stale `cm` from the earlier untuned booster was re-plotted here, so the
# plot did not reflect the tuned model at all.
cm = confusion_matrix(y_test, pred)
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
# Predicted vs. actual class distributions.
pd.Series(pred).value_counts()
y_test.value_counts()
proba
# Confidence of the winning class per row.
proba.max(axis=1)
# Classic feature attribution
Feature Importance by Weights
The number of times a feature is used to split the data across all trees.
# Classic xgboost feature importance by "weight": how many times a feature
# is used to split, summed over all trees.
pl.rcParams['figure.figsize'] = [20, 20]
xgboost.plot_importance(model)
pl.title("xgboost.plot_importance(model)")
pl.show()
We see above that the following features are important by weights
1- prod_mean_Bookings
2- prod_bookings
3- prod_bookings_per_year
4- total_mean_bookings
5- prod_days_since_last_agreement
Feature Importance by Cover
The number of times a feature is used to split the data across all trees weighted by the number of training data points that go through those splits.
# Importance by "cover": splits weighted by the number of training points
# that pass through them.  Title/show added for consistency with the
# "weight" and "gain" importance plots in this notebook.
xgboost.plot_importance(model, importance_type="cover")
pl.title('xgboost.plot_importance(model, importance_type="cover")')
pl.show()
We see above that the following features are important by cover
1- TERRIOTRY_OWNERAGENT
2- cust_prod_booking_trend
3- visits_trend
4- visits_3_year_change
5- users_3_year_change
Feature Importance by Gain
The average training loss reduction gained when using a feature for splitting.
# Importance by "gain": average training-loss reduction contributed by a
# feature's splits.
xgboost.plot_importance(model, importance_type="gain")
pl.title('xgboost.plot_importance(model, importance_type="gain")')
pl.show()
We see above that the following features are important by gain
1- cust_prod_booking_trend
2- prod_num_agrmts_with_parents
3- prod_num_agrmnts
4- visits_trend
5- TERRIOTRY_OWNERAGENT
# this takes a minute or two since we are explaining over the whole
# feature matrix X (not just the test split)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # multiclass: one array per class
Visualize a single prediction Note that we use the "display values" data frame so we get nice strings instead of category codes.
# One force plot of row #1 per churn class (display() is the IPython
# rich-output helper available in the notebook).
for which_class in range(0,3):
    display(shap.force_plot(explainer.expected_value[which_class], shap_values[which_class][1], X_display.iloc[1,:]))
Visualize many predictions To keep the browser happy we only visualize the first 10 individuals here.
# Stacked force plot for class index 1 over the first 10 rows.
shap.force_plot(explainer.expected_value[1], shap_values[1][:10,:], X_display.iloc[:10,:])
Bar chart of mean importance This takes the average of the SHAP value magnitudes across the dataset and plots it as a simple bar chart.
from matplotlib import colors as plt_colors
# Human-readable churn class labels (order matches the encoded target 0/1/2).
classes = ['NONE', 'PARTIAL', 'TOTAL']
# One RGB tuple per class.  NOTE(review): this rebinds the module-level
# `colors` hex list defined near the top of the file.
colors = [(0, 0.5, 1), (1, 0.7, 0), (1, 0.5, 0)]
# Order classes by descending mean |SHAP| so the colormap entries line up
# with the order in which the summary plot draws the classes.
mean_abs_impact = [np.abs(v).mean() for v in shap_values]
class_inds = np.argsort([-m for m in mean_abs_impact])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_display, color=cmap, class_names=classes)
shap.summary_plot(shap_values[2], X_display)
shap.summary_plot(shap_values[1], X)
SHAP Summary Plot Rather than use a typical feature importance bar chart, we use a density scatter plot of SHAP values for each feature to identify how much impact each feature has on the model output for individuals in the validation dataset. Features are sorted by the sum of the SHAP value magnitudes across all samples. It is interesting to note that a feature with a lower total model impact can still dominate the specific predictions where it matters: some features affect a few predictions by a large amount, while others affect all predictions by a smaller amount.
Note that when the scatter points don't fit on a line they pile up to show density, and the color of each point represents the feature value of that individual.
# Hand-picked, reduced feature set for a simpler / more explainable model.
keep_features = ['prod_num_agrmts', 'prod_mean_bookings', 'prod_days_since_last_agreement',
                 'prod_days_since_first_agreement',
                 'total_mean_bookings', 'num_activities_Virtual',
                 'gerd_churn_year', 'CHURN_TYPE']
simple_sample = sample[keep_features]
simple_sample_display = sample_display[keep_features]
# Predictors vs. target, for both the model frame and the display frame.
feature_mask = simple_sample.columns != 'CHURN_TYPE'
X = simple_sample.loc[:, feature_mask]
y = simple_sample['CHURN_TYPE']
X_display = simple_sample_display.loc[:, simple_sample_display.columns != 'CHURN_TYPE']
y_display = simple_sample_display['CHURN_TYPE']
# Same 80/20 split and seed as the full-feature model for comparability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# Hyper-parameters for the reduced-feature model (slightly deeper trees
# than the baseline classifier, otherwise unchanged).
clf_params = dict(learning_rate=0.5,
                  n_estimators=120,
                  max_depth=4,
                  min_child_weight=0,
                  gamma=0,
                  reg_lambda=1,
                  subsample=1,
                  colsample_bytree=0.75,
                  scale_pos_weight=1,
                  objective='multi:softprob',
                  num_class=3,
                  random_state=42)
xgbc = xgboost.XGBClassifier(**clf_params)
start_time = time.time()
model = xgbc.fit(X_train, y_train, eval_metric='mlogloss')
elapsed_min = round((time.time() - start_time)/60, 2)
print(f'Model training completed in {elapsed_min} mins for {len(X_train)} samples')
pred = model.predict(X_test)
proba = model.predict_proba(X_test)
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the reduced-feature model.
print(classification_report(y_test, pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Plot a confusion matrix as a heatmap.

    Colors come from the (optionally row-normalized) matrix; the cell
    annotations always show the raw counts.

    NOTE(review): this redefines the identically-named helper from earlier
    in the notebook; kept in place so this cell stays self-contained.
    """
    plt.figure(figsize=[7, 6])
    norm_cm = cm
    if normalized:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # FIX: avoid NaNs from 0/0 when a class has no true samples.
        norm_cm = np.divide(cm.astype('float'), row_totals,
                            out=np.zeros(cm.shape, dtype=float),
                            where=row_totals != 0)
    sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
    #plt.savefig('confusion-matrix.png')
plot_confusion_matrix(cm, ['NONE', 'PARTIAL', 'TOTAL'])
# Predicted vs. actual class distributions.
pd.Series(pred).value_counts()
y_test.value_counts()
# this takes a minute or two since we are explaining every row of X
# (presumably ~30k samples — confirm against the data load above)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # list: one (n_samples, n_features) array per class
shap_values
X.columns
# Stack the per-class SHAP arrays vertically into a single frame
# (rows = class-0 block, then class-1 block, then class-2 block).
shap_df = pd.DataFrame(np.concatenate(shap_values))
shap_df.columns = X.columns
shap_df
Visualize a single prediction Note that we use the "display values" data frame so we get nice strings instead of category codes.
You can visualise how each feature and its value affects the prediction — whether it pushes the output higher or lower, and the size of the effect.
# Single-row force plot for class index 0; the "display" frame gives
# human-readable values instead of category codes.
shap.force_plot(explainer.expected_value[0], shap_values[0][1], X_display.iloc[1,:])
# Same row, one force plot per churn class.
for which_class in range(0,3):
    display(shap.force_plot(explainer.expected_value[which_class], shap_values[which_class][1], X_display.iloc[1,:]))
Visualize many predictions To keep the browser happy we only visualize 1,000 individuals.
# Stacked force plot for class index 2 over the first 1,000 rows.
shap.force_plot(explainer.expected_value[2], shap_values[2][:1000,:], X_display.iloc[:1000,:])
Bar chart of mean importance This takes the average of the SHAP value magnitudes across the dataset and plots it as a simple bar chart.
from matplotlib import colors as plt_colors
# Human-readable churn class labels (order matches the encoded target 0/1/2).
classes = ['NONE', 'PARTIAL', 'TOTAL']
# One RGB tuple per class.  NOTE(review): this rebinds the module-level
# `colors` hex list defined near the top of the file.
colors = [(0, 0.5, 1), (1, 0.7, 0), (1, 0.5, 0)]
# Order classes by descending mean |SHAP| so the colormap entries line up
# with the order in which the summary plot draws the classes.
mean_abs_impact = [np.abs(v).mean() for v in shap_values]
class_inds = np.argsort([-m for m in mean_abs_impact])
cmap = plt_colors.ListedColormap(np.array(colors)[class_inds])
shap.summary_plot(shap_values, X_display, color=cmap, class_names=classes)
Variables most significant for predicting class 2, i.e. Total Churn
# Per-feature SHAP value distribution for class index 2.
shap.summary_plot(shap_values[2], X)
Variables most significant for predicting class 1, i.e. Partial Churn
# Per-feature SHAP value distribution for class index 1.
shap.summary_plot(shap_values[1], X)
While a SHAP summary plot gives a general overview of each feature, a SHAP dependence plot shows how the model output varies by feature value. Note that every dot is a customer x product record, and the vertical dispersion at a single feature value results from interaction effects in the model. The feature used for coloring is automatically chosen to highlight what might be driving these interactions.
# 'CHURN_TYPE' is the target, not a predictor — drop it before plotting.
keep_features.remove('CHURN_TYPE')
# Dependence plot per feature, for the TOTAL (class 2) then PARTIAL
# (class 1) outputs.  NO_CHURN (class 0) plots were previously disabled
# via commented-out code and remain omitted.
for this_feature in keep_features:
    for churn_label, class_idx in (('TOTAL_CHURN', 2), ('PARTIAL_CHURN', 1)):
        print(f' SHAP partial dependence plot for {this_feature} on {churn_label}')
        shap.dependence_plot(this_feature, shap_values[class_idx], X, display_features=X_display)
# Serialise and save the trained model for downstream scoring.
# FIX: use a context manager so the file handle is flushed and closed
# deterministically — the original passed an anonymous open() handle to
# pickle.dump and never closed it.
with open("../models/solutions_xgb.pickle.dat", "wb") as model_file:
    pickle.dump(model, model_file)